import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as pt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from six import StringIO
from sklearn.tree import export_graphviz
import pydotplus, graphviz
from IPython.display import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
#from sklearn.metrics import classification_report,plot_confusion_matrix,accuracy_score
#!pip install graphviz
#brew install graphviz
#!conda install python-graphviz
#!pip install pydotplus
#conda install -c anaconda graphviz
# Read the training and test CSV files into DataFrames.
# NOTE(review): the paths are absolute and machine-specific; adjust them
# before running this notebook on another machine.
train_csv_path = '/Users/huzaifkherani/Desktop/AML/Project/DATA/data.csv'
test_csv_path = '/Users/huzaifkherani/Desktop/AML/Project/DATA/test.csv'
data_train = pd.read_csv(train_csv_path)
data_test = pd.read_csv(test_csv_path)
# Preview the first rows of the training frame.
data_train.head()
| Severity | Safety_Score | Days_Since_Inspection | Total_Safety_Complaints | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Accident_Type_Code | Max_Elevation | Violations | Adverse_Weather_Metric | Accident_ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Minor_Damage_And_Injuries | 49.223744 | 14 | 22 | 71.285324 | 0.272118 | 78.04 | 2 | 31335.476824 | 3 | 0.424352 | 7570 |
| 1 | Minor_Damage_And_Injuries | 62.465753 | 10 | 27 | 72.288058 | 0.423939 | 84.54 | 2 | 26024.711057 | 2 | 0.352350 | 12128 |
| 2 | Significant_Damage_And_Fatalities | 63.059361 | 13 | 16 | 66.362808 | 0.322604 | 78.86 | 7 | 39269.053927 | 3 | 0.003364 | 2181 |
| 3 | Significant_Damage_And_Serious_Injuries | 48.082192 | 11 | 9 | 74.703737 | 0.337029 | 81.79 | 3 | 42771.499200 | 1 | 0.211728 | 5946 |
| 4 | Significant_Damage_And_Fatalities | 26.484018 | 13 | 25 | 47.948952 | 0.541140 | 77.16 | 3 | 35509.228515 | 2 | 0.176883 | 9054 |
# Print column names, non-null counts and dtypes of the training frame.
data_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Severity 10000 non-null object 1 Safety_Score 10000 non-null float64 2 Days_Since_Inspection 10000 non-null int64 3 Total_Safety_Complaints 10000 non-null int64 4 Control_Metric 10000 non-null float64 5 Turbulence_In_gforces 10000 non-null float64 6 Cabin_Temperature 10000 non-null float64 7 Accident_Type_Code 10000 non-null int64 8 Max_Elevation 10000 non-null float64 9 Violations 10000 non-null int64 10 Adverse_Weather_Metric 10000 non-null float64 11 Accident_ID 10000 non-null int64 dtypes: float64(6), int64(5), object(1) memory usage: 937.6+ KB
# Number of missing values in each column (isna is the same check as isnull).
data_train.isna().sum()
Severity 0 Safety_Score 0 Days_Since_Inspection 0 Total_Safety_Complaints 0 Control_Metric 0 Turbulence_In_gforces 0 Cabin_Temperature 0 Accident_Type_Code 0 Max_Elevation 0 Violations 0 Adverse_Weather_Metric 0 Accident_ID 0 dtype: int64
# Remove the Accident_ID column from the training frame in place.
data_train.drop(columns=['Accident_ID'], inplace=True)
# Pairwise correlations between the features only: the target column
# (Severity) is left out of the frame that is correlated.
feature_frame = data_train.drop(columns="Severity")
feature_frame.corr()
| Safety_Score | Days_Since_Inspection | Total_Safety_Complaints | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Accident_Type_Code | Max_Elevation | Violations | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|---|---|---|---|
| Safety_Score | 1.000000 | -0.685386 | 0.057726 | 0.000564 | 0.019603 | 0.032747 | 0.173930 | 0.004451 | 0.041735 | -0.107925 |
| Days_Since_Inspection | -0.685386 | 1.000000 | -0.032055 | -0.011963 | -0.001564 | -0.039140 | -0.024718 | 0.000183 | -0.016724 | 0.040804 |
| Total_Safety_Complaints | 0.057726 | -0.032055 | 1.000000 | -0.019665 | 0.066412 | 0.013590 | 0.034927 | 0.036855 | -0.019005 | -0.002713 |
| Control_Metric | 0.000564 | -0.011963 | -0.019665 | 1.000000 | -0.643285 | -0.008330 | 0.008385 | -0.028375 | -0.003284 | -0.028296 |
| Turbulence_In_gforces | 0.019603 | -0.001564 | 0.066412 | -0.643285 | 1.000000 | 0.010757 | -0.007565 | 0.047625 | 0.013171 | 0.039802 |
| Cabin_Temperature | 0.032747 | -0.039140 | 0.013590 | -0.008330 | 0.010757 | 1.000000 | 0.030682 | -0.009186 | 0.018619 | -0.026647 |
| Accident_Type_Code | 0.173930 | -0.024718 | 0.034927 | 0.008385 | -0.007565 | 0.030682 | 1.000000 | 0.019970 | 0.046379 | -0.739361 |
| Max_Elevation | 0.004451 | 0.000183 | 0.036855 | -0.028375 | 0.047625 | -0.009186 | 0.019970 | 1.000000 | -0.030513 | 0.173436 |
| Violations | 0.041735 | -0.016724 | -0.019005 | -0.003284 | 0.013171 | 0.018619 | 0.046379 | -0.030513 | 1.000000 | -0.021578 |
| Adverse_Weather_Metric | -0.107925 | 0.040804 | -0.002713 | -0.028296 | 0.039802 | -0.026647 | -0.739361 | 0.173436 | -0.021578 | 1.000000 |
# Correlation heatmap of the numeric training columns, saved to disk.
# Severity is still an object (string) column at this point, so only numeric
# columns are selected explicitly: DataFrame.corr() raises on non-numeric
# columns in recent pandas instead of silently skipping them.
pt.figure(figsize=(15, 7))
# Left cell of a 1x2 grid; nothing is drawn in the right cell in this cell.
pt.subplot(1, 2, 1)
pt.title("Train Data")
sns.heatmap(data_train.select_dtypes(include='number').corr())
pt.savefig('Correlation Heatmap.png')
# Rows whose Days_Since_Inspection value equals 1.
inspected_one_day_ago = data_train["Days_Since_Inspection"].eq(1)
inspec = data_train.loc[inspected_one_day_ago]
inspec
| Severity | Safety_Score | Days_Since_Inspection | Total_Safety_Complaints | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Accident_Type_Code | Max_Elevation | Violations | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 840 | Significant_Damage_And_Serious_Injuries | 58.675799 | 1 | 2 | 75.387420 | 0.336308 | 75.96 | 6 | 29146.687854 | 3 | 0.007526 |
| 2283 | Significant_Damage_And_Serious_Injuries | 58.493151 | 1 | 5 | 75.387420 | 0.245792 | 76.11 | 6 | 16149.317704 | 3 | 0.003679 |
| 2611 | Highly_Fatal_And_Damaging | 65.342466 | 1 | 19 | 77.848678 | 0.361191 | 79.65 | 2 | 31661.628810 | 2 | 0.429535 |
| 7903 | Highly_Fatal_And_Damaging | 65.342466 | 1 | 8 | 58.204193 | 0.312146 | 79.43 | 2 | 28183.323130 | 2 | 0.382453 |
| 8152 | Significant_Damage_And_Serious_Injuries | 58.493151 | 1 | 20 | 67.046490 | 0.409514 | 79.72 | 7 | 25135.851480 | 2 | 0.002300 |
# Class balance of the target: number of rows per Severity label.
pt.figure()
pt.xticks(rotation = 90)
# The column is passed through the `x` keyword. Passing it positionally is
# deprecated (seaborn FutureWarning) and from seaborn 0.12 on the only valid
# positional argument is `data`, so the positional form errors or is
# misinterpreted there.
sns.countplot(x=data_train['Severity'])
#pt.savefig('Severity vs Count graph.png')
/Users/huzaifkherani/opt/anaconda3/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='Severity', ylabel='count'>
# Box plots of seven columns drawn on one shared axis.
# Severity (categorical) is not plotted.
# NOTE(review): the earlier remark said Accident_Type_Code is categorical and
# was being removed as well, but it is still in the list below - confirm intent.
pt.figure(figsize=(16, 6))
first_boxplot_columns = [
    'Safety_Score',
    'Days_Since_Inspection',
    'Total_Safety_Complaints',
    'Control_Metric',
    'Cabin_Temperature',
    'Accident_Type_Code',
    'Violations',
]
data_train.boxplot(column=first_boxplot_columns)
#pt.savefig('Box plot 1.png')
<AxesSubplot:>
# Max_Elevation gets its own figure and box plot.
pt.figure(figsize=(12, 6))
elevation_columns = ['Max_Elevation']
data_train.boxplot(column=elevation_columns)
#pt.savefig('Boxplot 2.png')
<AxesSubplot:>
# Box plots of the turbulence and adverse-weather columns on one figure.
pt.figure(figsize=(12, 6))
fractional_columns = ['Turbulence_In_gforces', 'Adverse_Weather_Metric']
data_train.boxplot(column=fractional_columns)
#pt.savefig('Boxplot 3.png')
<AxesSubplot:>
# Re-check the schema of the training frame (Accident_ID was dropped earlier).
data_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10000 entries, 0 to 9999 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Severity 10000 non-null object 1 Safety_Score 10000 non-null float64 2 Days_Since_Inspection 10000 non-null int64 3 Total_Safety_Complaints 10000 non-null int64 4 Control_Metric 10000 non-null float64 5 Turbulence_In_gforces 10000 non-null float64 6 Cabin_Temperature 10000 non-null float64 7 Accident_Type_Code 10000 non-null int64 8 Max_Elevation 10000 non-null float64 9 Violations 10000 non-null int64 10 Adverse_Weather_Metric 10000 non-null float64 dtypes: float64(6), int64(4), object(1) memory usage: 859.5+ KB
# Build a frame that holds only the float64 columns of data_train.
float_column_names = data_train.dtypes[data_train.dtypes == 'float64'].index
data_num = pd.DataFrame(data_train, columns=float_column_names)
data_num.head()
| Safety_Score | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Max_Elevation | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|
| 0 | 49.223744 | 71.285324 | 0.272118 | 78.04 | 31335.476824 | 0.424352 |
| 1 | 62.465753 | 72.288058 | 0.423939 | 84.54 | 26024.711057 | 0.352350 |
| 2 | 63.059361 | 66.362808 | 0.322604 | 78.86 | 39269.053927 | 0.003364 |
| 3 | 48.082192 | 74.703737 | 0.337029 | 81.79 | 42771.499200 | 0.211728 |
| 4 | 26.484018 | 47.948952 | 0.541140 | 77.16 | 35509.228515 | 0.176883 |
# Standardize each float column independently with scipy's zscore
# (subtract the column mean, divide by the column standard deviation).
standardized = data_num.apply(zscore, axis=0)
data_num = standardized
data_num.head()
| Safety_Score | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Max_Elevation | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|
| 0 | 0.455303 | 0.516733 | -0.901749 | -0.699134 | -0.070649 | 0.442701 |
| 1 | 1.275888 | 0.601122 | 0.349922 | 1.656279 | -0.633736 | 0.253773 |
| 2 | 1.312673 | 0.102462 | -0.485516 | -0.401990 | 0.770528 | -0.661939 |
| 3 | 0.384562 | 0.804422 | -0.366593 | 0.659758 | 1.141883 | -0.115208 |
| 4 | -0.953841 | -1.447221 | 1.316177 | -1.018021 | 0.371883 | -0.206638 |
# Outlier removal on the standardized features.
# A row is an outlier when any of its z-scores is above 3 or below -3.
# One row-wise mask replaces the per-column drop loop; because the z-scores
# are not recomputed between drops, the set of removed rows is the same.
is_outlier = ((data_num > 3) | (data_num < -3)).any(axis=1)
outlier_rows = is_outlier[is_outlier].index
# Drop the same row labels from both frames, in place, so they stay aligned.
data_num.drop(outlier_rows, inplace=True)
data_train.drop(outlier_rows, inplace=True)
data_num.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 9507 entries, 0 to 9999 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Safety_Score 9507 non-null float64 1 Control_Metric 9507 non-null float64 2 Turbulence_In_gforces 9507 non-null float64 3 Cabin_Temperature 9507 non-null float64 4 Max_Elevation 9507 non-null float64 5 Adverse_Weather_Metric 9507 non-null float64 dtypes: float64(6) memory usage: 519.9 KB
# Remove every float64 column from data_train in place; only the target and
# the integer columns remain afterwards.
raw_float_columns = data_train.dtypes[data_train.dtypes == 'float64'].index
data_train.drop(columns=raw_float_columns, inplace=True)
data_train.head()
| Severity | Days_Since_Inspection | Total_Safety_Complaints | Accident_Type_Code | Violations | |
|---|---|---|---|---|---|
| 0 | Minor_Damage_And_Injuries | 14 | 22 | 2 | 3 |
| 1 | Minor_Damage_And_Injuries | 10 | 27 | 2 | 2 |
| 2 | Significant_Damage_And_Fatalities | 13 | 16 | 7 | 3 |
| 3 | Significant_Damage_And_Serious_Injuries | 11 | 9 | 3 | 1 |
| 4 | Significant_Damage_And_Fatalities | 13 | 25 | 3 | 2 |
# Copy every column of data_num into data_train under the same column name.
# Assignment aligns on the row index, which both frames share.
for column_name, column_values in data_num.items():
    data_train[column_name] = column_values
data_train.head()
| Severity | Days_Since_Inspection | Total_Safety_Complaints | Accident_Type_Code | Violations | Safety_Score | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Max_Elevation | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Minor_Damage_And_Injuries | 14 | 22 | 2 | 3 | 0.455303 | 0.516733 | -0.901749 | -0.699134 | -0.070649 | 0.442701 |
| 1 | Minor_Damage_And_Injuries | 10 | 27 | 2 | 2 | 1.275888 | 0.601122 | 0.349922 | 1.656279 | -0.633736 | 0.253773 |
| 2 | Significant_Damage_And_Fatalities | 13 | 16 | 7 | 3 | 1.312673 | 0.102462 | -0.485516 | -0.401990 | 0.770528 | -0.661939 |
| 3 | Significant_Damage_And_Serious_Injuries | 11 | 9 | 3 | 1 | 0.384562 | 0.804422 | -0.366593 | 0.659758 | 1.141883 | -0.115208 |
| 4 | Significant_Damage_And_Fatalities | 13 | 25 | 3 | 2 | -0.953841 | -1.447221 | 1.316177 | -1.018021 | 0.371883 | -0.206638 |
# Distinct Severity labels before encoding.
data_train.Severity.unique()
array(['Minor_Damage_And_Injuries', 'Significant_Damage_And_Fatalities',
'Significant_Damage_And_Serious_Injuries',
'Highly_Fatal_And_Damaging'], dtype=object)
# Replace the Severity strings with integer codes.
# LabelEncoder numbers the classes in sorted order of the original labels;
# the fitted encoder is kept so codes can be mapped back via encoder.classes_.
encoder = LabelEncoder().fit(data_train['Severity'])
data_train['Severity'] = encoder.transform(data_train['Severity'])
data_train.head()
| Severity | Days_Since_Inspection | Total_Safety_Complaints | Accident_Type_Code | Violations | Safety_Score | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Max_Elevation | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 14 | 22 | 2 | 3 | 0.455303 | 0.516733 | -0.901749 | -0.699134 | -0.070649 | 0.442701 |
| 1 | 1 | 10 | 27 | 2 | 2 | 1.275888 | 0.601122 | 0.349922 | 1.656279 | -0.633736 | 0.253773 |
| 2 | 2 | 13 | 16 | 7 | 3 | 1.312673 | 0.102462 | -0.485516 | -0.401990 | 0.770528 | -0.661939 |
| 3 | 3 | 11 | 9 | 3 | 1 | 0.384562 | 0.804422 | -0.366593 | 0.659758 | 1.141883 | -0.115208 |
| 4 | 2 | 13 | 25 | 3 | 2 | -0.953841 | -1.447221 | 1.316177 | -1.018021 | 0.371883 | -0.206638 |
# Distinct values of the dependent variable (Severity) after encoding.
data_train['Severity'].unique()
array([1, 2, 3, 0])
# Distinct values of Accident_Type_Code.
data_train['Accident_Type_Code'].unique()
array([2, 7, 3, 4, 1, 6, 5])
# Distinct values of Violations.
data_train['Violations'].unique()
array([3, 2, 1, 0, 4, 5])
# Distinct values of Days_Since_Inspection.
data_train['Days_Since_Inspection'].unique()
array([14, 10, 13, 11, 15, 18, 5, 6, 12, 7, 8, 17, 9, 16, 20, 19, 21,
3, 4, 1, 22, 2, 23])
# Severity: a 4-level factor describing the severity of the crash.
# NOTE(review): the column holds nominal label codes here, so the mean, std
# and quartiles below have little meaning.
data_train.Severity.describe()
count 9507.000000 mean 1.416430 std 1.183658 min 0.000000 25% 0.000000 50% 1.000000 75% 3.000000 max 3.000000 Name: Severity, dtype: float64
# Safety_Score: a measure of how safe the plane was deemed to be.
# The column holds z-scores at this point, not the raw scores.
data_train.Safety_Score.describe()
count 9507.000000 mean -0.000848 std 0.983860 min -2.595013 25% -0.690688 50% -0.034219 75% 0.659034 max 2.925549 Name: Safety_Score, dtype: float64
# Days_Since_Inspection: how long the plane went without an inspection
# before the incident.
data_train.Days_Since_Inspection.describe()
count 9507.000000 mean 12.957926 std 3.514377 min 1.000000 25% 11.000000 50% 13.000000 75% 15.000000 max 23.000000 Name: Days_Since_Inspection, dtype: float64
# Total_Safety_Complaints: number of complaints from mechanics prior to the
# accident.
data_train.Total_Safety_Complaints.describe()
count 9507.000000 mean 6.497633 std 6.886067 min 0.000000 25% 2.000000 50% 4.000000 75% 9.000000 max 54.000000 Name: Total_Safety_Complaints, dtype: float64
# Control_Metric: an estimate of how much control the pilot had during the
# incident, given the factors at play.
# The column holds z-scores at this point, not the raw metric.
data_train.Control_Metric.describe()
count 9507.000000 mean 0.022516 std 0.972893 min -2.966217 25% -0.662790 50% 0.052596 75% 0.697018 max 2.933317 Name: Control_Metric, dtype: float64
# Turbulence_In_gforces: recorded turbulence experienced at the time of the
# accident. The column holds z-scores at this point, not raw g-forces.
data_train.Turbulence_In_gforces.describe()
count 9507.000000 mean -0.038054 std 0.942622 min -2.040443 25% -0.729310 50% -0.149557 75% 0.547633 max 2.960296 Name: Turbulence_In_gforces, dtype: float64
# Cabin_Temperature: last recorded temperature before the incident.
# The column holds z-scores at this point, not raw temperatures.
data_train.Cabin_Temperature.describe()
count 9507.000000 mean -0.030554 std 0.947306 min -1.894959 25% -0.731747 50% -0.162825 75% 0.551047 max 2.975310 Name: Cabin_Temperature, dtype: float64
# Max_Elevation: height from the ground (metres, per the original note).
# The column holds z-scores at this point, not raw heights.
data_train.Max_Elevation.describe()
count 9507.000000 mean -0.034620 std 0.973693 min -2.984254 25% -0.674868 50% -0.032461 75% 0.620405 max 2.995312 Name: Max_Elevation, dtype: float64
# Violations: number of violations the aircraft received during inspection.
data_train.Violations.describe()
count 9507.000000 mean 2.011255 std 1.037271 min 0.000000 25% 1.000000 50% 2.000000 75% 3.000000 max 5.000000 Name: Violations, dtype: float64
# One bar per Days_Since_Inspection value, aggregating the encoded Severity.
# NOTE(review): Severity holds nominal label codes, so an aggregate of the
# codes is hard to interpret - confirm this plot is what was intended.
sns.barplot(data=data_train, x="Days_Since_Inspection", y="Severity")
#pt.savefig('Days since inspection vs Severity.png')
<AxesSubplot:xlabel='Days_Since_Inspection', ylabel='Severity'>
#sns.barplot(x = "Safety_Score", y = "Severity", data = data_train)
# One bar per Accident_Type_Code value, aggregating the encoded Severity.
sns.barplot(data=data_train, x="Accident_Type_Code", y="Severity")
#pt.savefig('Accident type code vs Severity.png')
<AxesSubplot:xlabel='Accident_Type_Code', ylabel='Severity'>
# Replace three columns by 2 raised to their current value.
# NOTE(review): this overwrites the columns in place, so running the cell a
# second time applies the transform again.
for exponent_column in ('Total_Safety_Complaints', 'Days_Since_Inspection', 'Safety_Score'):
    data_train[exponent_column] = np.power(2, data_train[exponent_column])

# Features are every column except the target.
y = data_train['Severity']
X = data_train.drop(columns=['Severity'])

# Hold out 10% of the rows for validation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=1
)
for split_features in (X_train, X_test):
    print(split_features.shape)
(8556, 10) (951, 10)
# Decision tree tuned by an exhaustive, 5-fold cross-validated grid search.
model = DecisionTreeClassifier(random_state=1234)

# Hyper-parameter grid.
# 'auto' is deliberately absent from max_features: for classifiers it was an
# alias of 'sqrt' (so it only duplicated every fit), it has been deprecated,
# and newer scikit-learn releases reject it.
param_grid = {
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [5, 10, 15, 20, 25, 50, 100],
    'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11],
    'max_depth': [5, 10, 15, 25, 100],
    'criterion': ['gini', 'entropy'],
}

# n_jobs=-1 runs the candidate fits on all available cores.
clf1 = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=5)

# fit() returns the search object itself, refitted on the best parameters.
best_clf_dt = clf1.fit(X_train, y_train)

# Evaluate on the held-out split.
predictions = best_clf_dt.predict(X_test)
print("Accuracy", accuracy_score(y_test,predictions))
print("CLASSIFICATION - REPORT \n")
print("Confusion matrix \n",confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
Accuracy 0.7886435331230284
CLASSIFICATION - REPORT
Confusion matrix
[[233 24 6 21]
[ 17 189 9 21]
[ 12 11 126 11]
[ 24 32 13 202]]
precision recall f1-score support
0 0.81 0.82 0.82 284
1 0.74 0.80 0.77 236
2 0.82 0.79 0.80 160
3 0.79 0.75 0.77 271
accuracy 0.79 951
macro avg 0.79 0.79 0.79 951
weighted avg 0.79 0.79 0.79 951
# Show the decision tree refitted with the best parameters found by the search.
clf1.best_estimator_
DecisionTreeClassifier(criterion='entropy', max_depth=15, max_features='auto',
min_samples_leaf=5, min_samples_split=20,
random_state=1234)
# Render the tuned decision tree as a PNG through Graphviz.
# export_graphviz expects class_names in ascending order of the numeric class
# labels. LabelEncoder assigns codes in sorted order of the original strings,
# so encoder.classes_ is already in that order. The previous hand-written list
# had three of the four names in the wrong position, which mislabeled nodes.
dot_data = StringIO()
export_graphviz(
    clf1.best_estimator_,
    out_file=dot_data,
    filled=True,
    rounded=True,
    feature_names=list(X.columns),
    class_names=[str(label) for label in encoder.classes_],
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.43098 to fit
from sklearn.ensemble import RandomForestClassifier

# Random forest tuned by a 5-fold cross-validated grid search.
model = RandomForestClassifier(random_state=1234)

# Candidate hyper-parameter values.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [1, 2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [3, 4, 5, 6, 7],
}

# Search over the whole grid on all available cores.
clf = GridSearchCV(model, param_grid=param_grid, n_jobs=-1, cv=5)

# fit() returns the search object, refitted on the best parameters.
best_clf_rf = clf.fit(X_train, y_train)

# Score the refitted model on the held-out split.
predictions = best_clf_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, predictions)
rf_confusion = confusion_matrix(y_test, predictions)
rf_report = classification_report(y_test, predictions)
print("Accuracy of Random Forest: ", rf_accuracy)
print("Confusion matrix \n", rf_confusion)
print(rf_report)
Accuracy of Random Forest: 0.8958990536277602
Confusion matrix
[[260 9 7 8]
[ 5 222 3 6]
[ 5 14 135 6]
[ 20 14 2 235]]
precision recall f1-score support
0 0.90 0.92 0.91 284
1 0.86 0.94 0.90 236
2 0.92 0.84 0.88 160
3 0.92 0.87 0.89 271
accuracy 0.90 951
macro avg 0.90 0.89 0.89 951
weighted avg 0.90 0.90 0.90 951
# Gradient boosting tuned by grid search (GridSearchCV's default CV splitting).
param_grid = {"n_estimators": [10, 20, 40, 100], 'max_depth': [3, 4, 5, 6]}
# Seeded like the decision-tree and random-forest models in this notebook so
# that the search result is reproducible between runs.
gb_model = GradientBoostingClassifier(random_state=1234)
grid = GridSearchCV(gb_model, param_grid)
grid.fit(X_train, y_train)
GridSearchCV(estimator=GradientBoostingClassifier(),
param_grid={'max_depth': [3, 4, 5, 6],
'n_estimators': [10, 20, 40, 100]})
# Parameter combination with the best cross-validated score.
grid.best_params_
{'max_depth': 6, 'n_estimators': 100}
# Predict the held-out split with the refitted best gradient-boosting model
# (what the search object's own predict() delegates to).
best_gb_model = grid.best_estimator_
predictions = best_gb_model.predict(X_test)
predictions
array([2, 0, 0, 2, 2, 3, 3, 0, 2, 0, 3, 3, 3, 0, 3, 3, 0, 3, 3, 2, 3, 0,
2, 0, 1, 2, 0, 0, 1, 3, 1, 0, 0, 1, 1, 0, 2, 0, 3, 1, 2, 2, 3, 0,
2, 3, 3, 0, 1, 2, 0, 1, 3, 0, 3, 1, 0, 2, 1, 1, 2, 2, 0, 1, 0, 3,
0, 2, 3, 0, 2, 1, 1, 2, 2, 0, 0, 0, 1, 2, 3, 1, 1, 2, 2, 3, 3, 3,
0, 1, 3, 0, 0, 3, 0, 0, 1, 0, 1, 3, 1, 1, 3, 0, 2, 1, 1, 2, 3, 1,
1, 3, 3, 0, 0, 2, 1, 3, 1, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 3,
0, 0, 3, 0, 0, 3, 1, 3, 3, 1, 3, 3, 3, 0, 3, 3, 0, 3, 1, 0, 0, 1,
1, 1, 1, 3, 1, 1, 0, 1, 1, 3, 2, 0, 1, 2, 1, 0, 3, 0, 3, 1, 2, 0,
3, 3, 0, 1, 0, 1, 0, 3, 0, 1, 1, 2, 3, 1, 0, 3, 0, 1, 0, 0, 1, 0,
0, 2, 3, 2, 3, 3, 1, 1, 1, 3, 2, 2, 3, 0, 0, 1, 3, 0, 3, 2, 1, 3,
2, 0, 3, 1, 1, 1, 0, 2, 0, 0, 1, 1, 1, 0, 2, 2, 3, 1, 1, 3, 3, 3,
2, 1, 0, 1, 2, 3, 1, 3, 3, 1, 0, 3, 1, 0, 0, 0, 1, 3, 3, 1, 3, 1,
2, 3, 1, 0, 1, 1, 0, 1, 2, 2, 2, 2, 2, 3, 3, 1, 2, 0, 1, 1, 1, 1,
3, 3, 0, 0, 3, 0, 1, 3, 3, 1, 3, 0, 3, 3, 3, 0, 0, 2, 2, 1, 0, 1,
1, 3, 2, 0, 0, 1, 3, 3, 3, 2, 0, 0, 0, 2, 0, 0, 3, 3, 3, 3, 2, 0,
3, 0, 1, 0, 0, 3, 0, 3, 2, 3, 3, 1, 1, 3, 1, 3, 0, 1, 0, 3, 0, 0,
1, 0, 3, 1, 1, 1, 3, 1, 2, 3, 0, 2, 1, 3, 1, 3, 1, 0, 0, 2, 0, 1,
0, 0, 3, 1, 2, 0, 3, 3, 3, 3, 0, 2, 0, 0, 3, 3, 1, 2, 3, 0, 3, 1,
1, 3, 1, 1, 3, 3, 1, 1, 0, 0, 0, 0, 1, 3, 0, 1, 3, 0, 0, 2, 2, 2,
0, 1, 1, 1, 0, 0, 0, 3, 1, 1, 2, 3, 1, 0, 0, 1, 3, 3, 2, 2, 3, 0,
1, 2, 0, 0, 1, 0, 3, 0, 1, 3, 2, 2, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0,
0, 0, 3, 3, 0, 3, 0, 2, 1, 3, 2, 0, 0, 0, 2, 0, 2, 2, 1, 1, 3, 0,
3, 3, 3, 1, 3, 0, 2, 1, 0, 3, 0, 2, 2, 0, 3, 3, 3, 1, 2, 3, 3, 2,
2, 3, 2, 2, 0, 0, 1, 1, 2, 0, 3, 1, 1, 0, 2, 0, 3, 3, 3, 3, 3, 2,
1, 3, 0, 1, 3, 1, 3, 3, 2, 1, 0, 3, 3, 3, 0, 1, 0, 3, 2, 0, 1, 2,
0, 0, 0, 0, 3, 3, 3, 1, 1, 2, 3, 3, 1, 1, 0, 3, 2, 3, 2, 3, 1, 3,
2, 3, 1, 0, 1, 0, 3, 2, 3, 3, 1, 1, 3, 3, 0, 0, 2, 1, 3, 3, 1, 1,
0, 0, 2, 2, 3, 2, 0, 0, 1, 0, 1, 2, 0, 3, 1, 1, 0, 2, 0, 0, 0, 0,
1, 0, 1, 0, 3, 1, 0, 2, 1, 1, 1, 1, 0, 0, 3, 3, 1, 1, 1, 2, 3, 0,
1, 3, 0, 2, 1, 2, 0, 0, 1, 3, 2, 2, 1, 1, 2, 0, 0, 0, 3, 1, 3, 2,
1, 0, 1, 3, 1, 3, 2, 3, 1, 1, 1, 3, 3, 2, 3, 2, 0, 1, 0, 3, 1, 3,
0, 2, 0, 0, 3, 0, 2, 0, 3, 2, 1, 2, 0, 0, 2, 0, 1, 0, 1, 2, 3, 2,
0, 3, 3, 1, 3, 3, 1, 0, 2, 0, 3, 1, 3, 3, 0, 2, 2, 2, 1, 1, 0, 0,
0, 2, 3, 2, 1, 3, 2, 3, 3, 3, 0, 3, 0, 0, 1, 3, 2, 1, 3, 3, 2, 1,
0, 0, 0, 3, 0, 3, 0, 0, 3, 2, 0, 1, 0, 1, 2, 1, 0, 3, 2, 2, 0, 0,
3, 0, 0, 0, 0, 2, 3, 1, 3, 1, 0, 2, 1, 3, 3, 0, 1, 3, 0, 3, 3, 3,
3, 1, 2, 3, 1, 0, 3, 1, 0, 1, 0, 1, 1, 2, 2, 3, 3, 3, 2, 1, 2, 0,
0, 0, 0, 0, 2, 2, 0, 3, 0, 2, 0, 3, 0, 2, 1, 0, 2, 0, 0, 0, 3, 3,
3, 1, 1, 0, 1, 3, 3, 1, 2, 3, 3, 2, 0, 3, 3, 2, 3, 2, 3, 0, 0, 1,
2, 0, 3, 3, 1, 2, 3, 3, 3, 0, 0, 0, 3, 1, 3, 0, 3, 2, 3, 3, 3, 0,
1, 1, 0, 1, 1, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 3, 3, 0, 2, 1, 1,
1, 1, 1, 0, 1, 1, 0, 0, 2, 0, 3, 0, 1, 2, 2, 3, 0, 0, 1, 3, 3, 1,
3, 2, 1, 0, 3, 3, 3, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 3, 1, 1, 1, 0,
1, 3, 3, 1, 0])
# Per-class precision / recall / F1 of the current predictions on the
# held-out split.
gb_report = classification_report(y_true=y_test, y_pred=predictions)
print(gb_report)
precision recall f1-score support
0 0.94 0.95 0.95 284
1 0.95 0.96 0.95 236
2 0.96 0.94 0.95 160
3 0.96 0.95 0.96 271
accuracy 0.95 951
macro avg 0.95 0.95 0.95 951
weighted avg 0.95 0.95 0.95 951
# Feature importances of the best gradient-boosting model; the entries follow
# the column order of the frame the model was fitted on (X_train).
grid.best_estimator_.feature_importances_
array([0.2324318 , 0.00451244, 0.11729531, 0.00133324, 0.39295686,
0.20007292, 0.00886474, 0.0051295 , 0.00602159, 0.03138161])
# Stand-alone example: grid searching key hyperparameters for gradient
# boosting on a synthetic classification dataset.
from sklearn.datasets import make_classification
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Synthetic dataset. It is kept under its own names so that the accident
# features/labels (X, y) and the fitted search object `grid` from the earlier
# cells are not overwritten by this example.
X_demo, y_demo = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)

# Model with default hyperparameters.
model = GradientBoostingClassifier()

# Grid of values to search.
demo_param_grid = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'subsample': [0.5, 0.7, 1.0],
    'max_depth': [3, 7, 9],
}

# Evaluation procedure: 10-fold stratified CV repeated 3 times (seeded).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Grid search scored by accuracy, run on all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=demo_param_grid, n_jobs=-1, cv=cv, scoring='accuracy')
grid_result = grid_search.fit(X_demo, y_demo)

# Best score and configuration.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Mean and standard deviation of the CV score for every configuration tried.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
Best: 0.945333 using {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.529333 (0.089403) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.525000 (0.075840) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.524000 (0.072874) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.776333 (0.034687) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.770667 (0.035957) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.738667 (0.049982) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.831000 (0.032696) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.814000 (0.038349) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.761000 (0.043077) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.831333 (0.037659) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.814667 (0.043261) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.773667 (0.034975) with: {'learning_rate': 0.0001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.537333 (0.112159) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.535000 (0.105095) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.531667 (0.095222) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.841000 (0.030260) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.841333 (0.033539) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.804000 (0.033625) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.870667 (0.031721) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.868333 (0.031526) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.809000 (0.029366) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.887000 (0.031427) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.881333 (0.029181) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.810667 (0.032857) with: {'learning_rate': 0.0001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.534667 (0.104267) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.535333 (0.106293) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.531333 (0.094012) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.833000 (0.033877) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.841667 (0.026967) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.808000 (0.029710) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.874667 (0.029970) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.877667 (0.029853) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.812333 (0.026543) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.892333 (0.027891) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.885000 (0.029972) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.812000 (0.027976) with: {'learning_rate': 0.0001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.813000 (0.033779) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.802000 (0.038070) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.761333 (0.043107) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.825333 (0.039474) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.814667 (0.038534) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.773667 (0.034975) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.834000 (0.036111) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.813000 (0.038914) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.781333 (0.034325) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.846667 (0.033797) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.833333 (0.033300) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.814333 (0.032730) with: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.845000 (0.034132) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.852333 (0.029403) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.806667 (0.030912) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.881667 (0.031632) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.875000 (0.030957) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.810667 (0.033059) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.886333 (0.033315) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.877333 (0.030214) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.808333 (0.033475) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.895333 (0.028371) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.884333 (0.030186) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.822000 (0.024276) with: {'learning_rate': 0.001, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.851333 (0.025263) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.862000 (0.032802) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.812333 (0.028365) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.889667 (0.031568) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.881000 (0.029704) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.812000 (0.027857) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.892667 (0.031826) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.884000 (0.029620) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.812000 (0.026128) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.902000 (0.027129) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.889000 (0.027123) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.818667 (0.028952) with: {'learning_rate': 0.001, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.823000 (0.028885) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.808000 (0.039107) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.780000 (0.035214) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.844333 (0.035934) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.836333 (0.036008) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.813333 (0.032283) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.857000 (0.030348) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.847000 (0.031849) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.836000 (0.034020) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.899000 (0.030039) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.890000 (0.030000) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.878333 (0.029107) with: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.870333 (0.024964) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.865667 (0.030734) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.808667 (0.029970) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.895667 (0.026164) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.880333 (0.031250) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.822667 (0.023795) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.898000 (0.031979) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.891000 (0.032797) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.837000 (0.025968) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.925000 (0.026045) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.920667 (0.025940) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.887000 (0.031953) with: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.865667 (0.025519) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.865333 (0.030192) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.812000 (0.027857) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.895667 (0.032831) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.886333 (0.031462) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.816667 (0.030037) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.901000 (0.030039) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.894667 (0.024322) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.819333 (0.028394) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.921667 (0.024642) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.919000 (0.027851) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.830667 (0.027681) with: {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.840333 (0.034106) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.840667 (0.025682) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.830000 (0.036515) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.887000 (0.029343) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.886667 (0.031658) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.880333 (0.033713) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.907000 (0.031744) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.900667 (0.030104) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.899000 (0.031236) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.926000 (0.026907) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.927000 (0.026975) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.919000 (0.025475) with: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.870667 (0.030869) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.869667 (0.028575) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.835333 (0.023055) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.916000 (0.026533) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.912000 (0.030485) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.879000 (0.033501) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.926333 (0.026011) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.927667 (0.027164) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.905667 (0.032113) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.944333 (0.022462) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.944333 (0.024857) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.926000 (0.025768) with: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.869333 (0.030761) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.878000 (0.024685) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.823333 (0.027968) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.917000 (0.031107) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.904000 (0.030725) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.845667 (0.028482) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.928667 (0.027415) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.932667 (0.031298) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.847667 (0.039806) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.939667 (0.026392) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.945333 (0.021715) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.858667 (0.047027) with: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
0.824667 (0.030739) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.5}
0.842000 (0.039446) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 10, 'subsample': 0.7}
0.844667 (0.032014) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 10, 'subsample': 1.0}
0.823333 (0.036086) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.5}
0.869000 (0.036729) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.7}
0.896333 (0.033812) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
0.832667 (0.042734) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.5}
0.876000 (0.040464) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.7}
0.905000 (0.032326) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
0.807667 (0.064945) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.5}
0.903000 (0.034269) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
0.922667 (0.027195) with: {'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
0.805667 (0.041608) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.5}
0.847333 (0.036600) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 10, 'subsample': 0.7}
0.875667 (0.026418) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 10, 'subsample': 1.0}
0.832000 (0.045417) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.5}
0.901667 (0.029107) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 50, 'subsample': 0.7}
0.918000 (0.029710) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 50, 'subsample': 1.0}
0.825333 (0.056729) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.5}
0.915667 (0.027771) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
0.916333 (0.034301) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
0.779333 (0.144751) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.5}
0.910667 (0.034731) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 500, 'subsample': 0.7}
0.923000 (0.032265) with: {'learning_rate': 1.0, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
0.805333 (0.046385) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.5}
0.854667 (0.026297) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 10, 'subsample': 0.7}
0.881667 (0.028412) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 10, 'subsample': 1.0}
0.840333 (0.048955) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.5}
0.912000 (0.033106) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 50, 'subsample': 0.7}
0.917000 (0.026096) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 50, 'subsample': 1.0}
0.804333 (0.120324) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.5}
0.909000 (0.028208) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.7}
0.918333 (0.024506) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 100, 'subsample': 1.0}
0.814667 (0.144770) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.5}
0.910667 (0.031826) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 500, 'subsample': 0.7}
0.918333 (0.025309) with: {'learning_rate': 1.0, 'max_depth': 9, 'n_estimators': 500, 'subsample': 1.0}
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
# Single-step pipeline so the search can address the classifier's
# hyper-parameters through the 'XGB__' prefix.
pipe_XGB = Pipeline([('XGB', XGBClassifier())])

# Candidate values to sample from (2 x 2 x 2 = 8 possible combinations).
param_grid = {
    'XGB__learning_rate': [0.1, 0.2],
    'XGB__max_depth': [5, 10],
    'XGB__gamma': [0.1, 0.3],
}

# Randomized search: 3 sampled combinations, each scored with 10-fold CV.
# random_state pins which combinations are drawn, so re-running the cell
# evaluates the same candidates and reports the same best parameters.
Random_XGB = RandomizedSearchCV(
    pipe_XGB,
    param_distributions=param_grid,
    cv=10,
    n_iter=3,
    random_state=42,
)

# Fit on the training split; the search refits the best candidate afterwards,
# which is what .score() and .predict() use below.
Random_XGB.fit(X_train, y_train)

print(" Best cross-validation score obtained is: {:.2f}".format(Random_XGB.best_score_))
# NOTE: the label below says "Gridsearch", but the value comes from the
# randomized search above; the text is kept as-is to match recorded output.
print(" Best parameters as part of Gridsearch is: ", Random_XGB.best_params_)
print(" Train set score obtained is: {:.2f}".format(Random_XGB.score(X_train, y_train)))
print(" Test set score obtained is: {:.2f}".format(Random_XGB.score(X_test, y_test)))
Best cross-validation score obtained is: 0.96
Best parameters as part of Gridsearch is: {'XGB__max_depth': 10, 'XGB__learning_rate': 0.2, 'XGB__gamma': 0.1}
Train set score obtained is: 1.00
Test set score obtained is: 0.95
# Evaluate the tuned XGBoost model on the held-out split.
y_pred = Random_XGB.predict(X_test)

# The results are stored under model-specific names: binding them to
# `accuracy_score` / `recall_score` / `f1_score` would overwrite the
# `accuracy_score` function imported from sklearn.metrics at the top of the
# file and turn any later call to it into "'float' object is not callable".
# Precision, recall and F1 are macro-averaged over the severity classes.
xgb_accuracy = metrics.accuracy_score(y_test, y_pred)
xgb_precision = metrics.precision_score(y_test, y_pred, average='macro')
xgb_recall = metrics.recall_score(y_test, y_pred, average='macro')
xgb_f1 = metrics.f1_score(y_test, y_pred, average='macro')

print("The Accuracy of this model is {0:.2f}%".format(xgb_accuracy*100))
print("The Percision of this model is {0:.2f}%".format(xgb_precision*100))
print("The Recall score of this model is {0:.2f}%".format(xgb_recall*100))
print("The f1 score of this model is {0:.2f}%".format(xgb_f1*100))
The Accuracy of this model is 95.48%
The Percision of this model is 95.39%
The Recall score of this model is 95.45%
The f1 score of this model is 95.42%
# Display the raw results recorded by the randomized search: timings, the
# sampled parameter sets, per-split test scores, and their mean/std/rank.
Random_XGB.cv_results_
{'mean_fit_time': array([0.85219514, 1.55653679, 0.94187734]),
'std_fit_time': array([0.06057696, 0.08860524, 0.14080043]),
'mean_score_time': array([0.0035188 , 0.00447943, 0.00354114]),
'std_score_time': array([0.00052528, 0.00045283, 0.00053228]),
'param_XGB__max_depth': masked_array(data=[5, 10, 5],
mask=[False, False, False],
fill_value='?',
dtype=object),
'param_XGB__learning_rate': masked_array(data=[0.1, 0.2, 0.2],
mask=[False, False, False],
fill_value='?',
dtype=object),
'param_XGB__gamma': masked_array(data=[0.3, 0.1, 0.1],
mask=[False, False, False],
fill_value='?',
dtype=object),
'params': [{'XGB__max_depth': 5,
'XGB__learning_rate': 0.1,
'XGB__gamma': 0.3},
{'XGB__max_depth': 10, 'XGB__learning_rate': 0.2, 'XGB__gamma': 0.1},
{'XGB__max_depth': 5, 'XGB__learning_rate': 0.2, 'XGB__gamma': 0.1}],
'split0_test_score': array([0.94392523, 0.96028037, 0.95093458]),
'split1_test_score': array([0.94158879, 0.9521028 , 0.94859813]),
'split2_test_score': array([0.95443925, 0.9614486 , 0.95911215]),
'split3_test_score': array([0.9567757 , 0.97196262, 0.9567757 ]),
'split4_test_score': array([0.95560748, 0.95794393, 0.95794393]),
'split5_test_score': array([0.94976636, 0.96378505, 0.95327103]),
'split6_test_score': array([0.94853801, 0.95789474, 0.95906433]),
'split7_test_score': array([0.95555556, 0.9754386 , 0.96608187]),
'split8_test_score': array([0.94269006, 0.96023392, 0.94736842]),
'split9_test_score': array([0.94736842, 0.95204678, 0.94853801]),
'mean_test_score': array([0.94962549, 0.96131374, 0.95476881]),
'std_test_score': array([0.00544726, 0.00717658, 0.00572731]),
'rank_test_score': array([3, 1, 2], dtype=int32)}
# Remove the Accident_ID column from the test frame in place, then preview
# the remaining columns.
data_test.drop(columns=['Accident_ID'], inplace=True)
data_test.head()
| Safety_Score | Days_Since_Inspection | Total_Safety_Complaints | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Accident_Type_Code | Max_Elevation | Violations | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19.497717 | 16 | 6 | 72.151322 | 0.388959 | 78.32 | 4 | 37949.724386 | 2 | 0.069692 |
| 1 | 58.173516 | 15 | 3 | 64.585232 | 0.250841 | 78.60 | 7 | 30194.805567 | 2 | 0.002777 |
| 2 | 33.287671 | 15 | 3 | 64.721969 | 0.336669 | 86.96 | 6 | 17572.925484 | 1 | 0.004316 |
| 3 | 3.287671 | 21 | 5 | 66.362808 | 0.421775 | 80.86 | 3 | 40209.186341 | 2 | 0.199990 |
| 4 | 10.867580 | 18 | 2 | 56.107566 | 0.313228 | 79.22 | 2 | 35495.525408 | 2 | 0.483696 |
# Print the column dtypes and non-null counts of the test frame.
data_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2500 entries, 0 to 2499 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Safety_Score 2500 non-null float64 1 Days_Since_Inspection 2500 non-null int64 2 Total_Safety_Complaints 2500 non-null int64 3 Control_Metric 2500 non-null float64 4 Turbulence_In_gforces 2500 non-null float64 5 Cabin_Temperature 2500 non-null float64 6 Accident_Type_Code 2500 non-null int64 7 Max_Elevation 2500 non-null float64 8 Violations 2500 non-null int64 9 Adverse_Weather_Metric 2500 non-null float64 dtypes: float64(6), int64(4) memory usage: 195.4 KB
# Pull the float64 columns of the test set into their own frame so they can
# be scaled separately from the integer columns.
num = data_test.loc[:, data_test.dtypes == 'float64'].copy()
num.head()
| Safety_Score | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Max_Elevation | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|
| 0 | 19.497717 | 72.151322 | 0.388959 | 78.32 | 37949.724386 | 0.069692 |
| 1 | 58.173516 | 64.585232 | 0.250841 | 78.60 | 30194.805567 | 0.002777 |
| 2 | 33.287671 | 64.721969 | 0.336669 | 86.96 | 17572.925484 | 0.004316 |
| 3 | 3.287671 | 66.362808 | 0.421775 | 80.86 | 40209.186341 | 0.199990 |
| 4 | 10.867580 | 56.107566 | 0.313228 | 79.22 | 35495.525408 | 0.483696 |
# Z-score each float column independently (column-wise apply).
# NOTE(review): the mean/std used here come from the test set itself --
# confirm this matches how the training features were scaled.
num = num.apply(zscore, axis=0)

# Drop the unscaled float64 columns from the test frame; only the integer
# columns remain afterwards.
data_test.drop(columns=data_test.columns[data_test.dtypes == 'float64'], inplace=True)
data_test.head()
| Days_Since_Inspection | Total_Safety_Complaints | Accident_Type_Code | Violations | |
|---|---|---|---|---|
| 0 | 16 | 6 | 4 | 2 |
| 1 | 15 | 3 | 7 | 2 |
| 2 | 15 | 3 | 6 | 1 |
| 3 | 21 | 5 | 3 | 2 |
| 4 | 18 | 2 | 2 | 2 |
# Append every scaled column back onto the test frame, after the integer
# columns, in the order they appear in `num`.
for name, scaled_values in num.items():
    data_test[name] = scaled_values
data_test.head()
| Days_Since_Inspection | Total_Safety_Complaints | Accident_Type_Code | Violations | Safety_Score | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Max_Elevation | Adverse_Weather_Metric | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16 | 6 | 4 | 2 | -1.371727 | 0.592957 | 0.109134 | -0.616620 | 0.586995 | -0.467493 |
| 1 | 15 | 3 | 7 | 2 | 1.004384 | -0.068431 | -1.071998 | -0.513424 | -0.230758 | -0.640138 |
| 2 | 15 | 3 | 6 | 1 | -0.524519 | -0.056478 | -0.338031 | 2.567706 | -1.561731 | -0.636168 |
| 3 | 21 | 5 | 3 | 2 | -2.367618 | 0.086956 | 0.389769 | 0.319513 | 0.825254 | -0.131314 |
| 4 | 18 | 2 | 2 | 2 | -1.901934 | -0.809504 | -0.538484 | -0.284919 | 0.328201 | 0.600666 |
# Replace each of these three features with 2 ** value.
# NOTE(review): two of these columns are int64, where np.power wraps silently
# for exponents >= 63 -- confirm the value ranges stay below that.
for feature in ('Total_Safety_Complaints', 'Days_Since_Inspection', 'Safety_Score'):
    data_test[feature] = np.power(2, data_test[feature])

# Predict encoded class labels for the test set with the tuned model.
testPredictions = Random_XGB.predict(data_test)

# Map the predictions back to label names and store them as a new column.
# `encoder` is defined earlier in the file -- presumably the LabelEncoder
# fitted on the training Severity column; verify.
decoded_labels = encoder.inverse_transform(testPredictions)
data_test['Severity'] = decoded_labels
data_test.head()
| Days_Since_Inspection | Total_Safety_Complaints | Accident_Type_Code | Violations | Safety_Score | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Max_Elevation | Adverse_Weather_Metric | Severity | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 65536 | 64 | 4 | 2 | 0.386428 | 0.592957 | 0.109134 | -0.616620 | 0.586995 | -0.467493 | Highly_Fatal_And_Damaging |
| 1 | 32768 | 8 | 7 | 2 | 2.006087 | -0.068431 | -1.071998 | -0.513424 | -0.230758 | -0.640138 | Significant_Damage_And_Fatalities |
| 2 | 32768 | 8 | 6 | 1 | 0.695191 | -0.056478 | -0.338031 | 2.567706 | -1.561731 | -0.636168 | Significant_Damage_And_Serious_Injuries |
| 3 | 2097152 | 32 | 3 | 2 | 0.193765 | 0.086956 | 0.389769 | 0.319513 | 0.825254 | -0.131314 | Highly_Fatal_And_Damaging |
| 4 | 262144 | 4 | 2 | 2 | 0.267584 | -0.809504 | -0.538484 | -0.284919 | 0.328201 | 0.600666 | Significant_Damage_And_Fatalities |
# Re-read the untouched test CSV (original feature values plus Accident_ID)
# and attach the predicted Severity column, aligned on the row index.
raw_test = pd.read_csv('/Users/huzaifkherani/Desktop/AML/Project/DATA/test.csv')
final_test = raw_test.assign(Severity=data_test['Severity'])
final_test.head()
| Safety_Score | Days_Since_Inspection | Total_Safety_Complaints | Control_Metric | Turbulence_In_gforces | Cabin_Temperature | Accident_Type_Code | Max_Elevation | Violations | Adverse_Weather_Metric | Accident_ID | Severity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 19.497717 | 16 | 6 | 72.151322 | 0.388959 | 78.32 | 4 | 37949.724386 | 2 | 0.069692 | 1 | Highly_Fatal_And_Damaging |
| 1 | 58.173516 | 15 | 3 | 64.585232 | 0.250841 | 78.60 | 7 | 30194.805567 | 2 | 0.002777 | 10 | Significant_Damage_And_Fatalities |
| 2 | 33.287671 | 15 | 3 | 64.721969 | 0.336669 | 86.96 | 6 | 17572.925484 | 1 | 0.004316 | 14 | Significant_Damage_And_Serious_Injuries |
| 3 | 3.287671 | 21 | 5 | 66.362808 | 0.421775 | 80.86 | 3 | 40209.186341 | 2 | 0.199990 | 17 | Highly_Fatal_And_Damaging |
| 4 | 10.867580 | 18 | 2 | 56.107566 | 0.313228 | 79.22 | 2 | 35495.525408 | 2 | 0.483696 | 21 | Significant_Damage_And_Fatalities |